geo_name_list = c("GSE129166", "GSE34748", "GSE51675", "GSE15296", "GSE46474", "GSE50084")
geo_list = c(getGEO(filename="Data/GSE129166_series_matrix.txt.gz"), getGEO(filename="Data/GSE34748_series_matrix.txt.gz"), getGEO(filename="Data/GSE51675_series_matrix.txt.gz"), getGEO(filename="Data/GSE15296_series_matrix.txt.gz"), getGEO(filename="Data/GSE46474_series_matrix.txt.gz"), getGEO(filename="Data/GSE50084_series_matrix.txt.gz"))
Using locally cached version of GPL570 found here:
/var/folders/81/4fc96fh52150fkrp33z0kklh0000gp/T//RtmpZiAUT6/GPL570.soft.gz
Using locally cached version of GPL570 found here:
/var/folders/81/4fc96fh52150fkrp33z0kklh0000gp/T//RtmpZiAUT6/GPL570.soft.gz
Using locally cached version of GPL570 found here:
/var/folders/81/4fc96fh52150fkrp33z0kklh0000gp/T//RtmpZiAUT6/GPL570.soft.gz
for(geo in geo_list) {
print(fData(geo))
print(names(which(colSums(is.na(fData(geo)))>0)))
}
character(0)
character(0)
[1] "GENE" "TIGR_ID"
character(0)
character(0)
character(0)
All gse have a gene symbol column which can be used to match records except for the last one, GSE50084 which has the gene as the 2nd listed element under gene_assignment, and GSE51675 which is missing all of its gene info.
for(geo in geo_list) {
print(pData(geo))
print(names(which(colSums(is.na(pData(geo)))>0)))
}
character(0)
character(0)
character(0)
character(0)
character(0)
character(0)
for (geo in geo_list) {
print(data.frame(t(exprs(geo))))
print(names(which(colSums(is.na(data.frame(t(exprs(geo)))))>0)))
}
character(0)
character(0)
character(0)
character(0)
character(0)
character(0)
for (geo in geo_list) {
df = data.frame(t(exprs(geo)))
boxplot(df[1:100])
}






Plots for GSE51675 and GSE15296 show that they have been transformed somehow. Other than those, the other 4 seem fairly contained between 0-14 expression intensity.
GSE51675 has been discarded due to small sample size, missing info, and transformed expression values. GSE50084 has been discarded
# Contains "GSE129166", "GSE34748"
length(union(fData(geo_list[[1]])["Gene Symbol"], fData(geo_list[[2]])["Gene Symbol"])[[1]])
[1] 23521
# Contains "GSE129166", "GSE46474"
length(union(fData(geo_list[[1]])["Gene Symbol"], fData(geo_list[[5]])["Gene Symbol"])[[1]])
[1] 23521
All 3 have perfect overlap so the datasets recommended for use are “GSE129166”, “GSE34748”, and “GSE46474”
(Mukund): Also just thinking, but we probably do want to go through the effort of trying to clean another dataset to match these. firstly for the sake of increased difficulty in the eyes of Jean, but also sometimes without going through the process it can be easy to overlook something in the data cleaning aspect. I am just a bit worried because after talking to some other people they have all told me the data cleaning for this is really hard so not sure if we are actually missing something.
GSE34748 = getGEO(filename="Data/GSE34748_series_matrix.txt.gz")
Using locally cached version of GPL570 found here:
/var/folders/81/4fc96fh52150fkrp33z0kklh0000gp/T//RtmpZiAUT6/GPL570.soft.gz
clinical_GSE34748 = pData(GSE34748)
Also just doing some checking and some of these don’t actually have clinical data
GSE46474 = getGEO(filename="Data/GSE46474_series_matrix.txt.gz")
Using locally cached version of GPL570 found here:
/var/folders/81/4fc96fh52150fkrp33z0kklh0000gp/T//RtmpZiAUT6/GPL570.soft.gz
clinical_GSE46474 = pData(GSE46474)
This one has clinical data!!
GSE129166 = getGEO(filename="Data/GSE129166_series_matrix.txt.gz")
Using locally cached version of GPL570 found here:
/var/folders/81/4fc96fh52150fkrp33z0kklh0000gp/T//RtmpZiAUT6/GPL570.soft.gz
clinical_GSE129166 = pData(GSE129166)
View(clinical_GSE129166)
Also no clinical data
GSE129166 = getGEO(filename="Data/GSE129166_series_matrix.txt.gz")
Using locally cached version of GPL570 found here:
/var/folders/81/4fc96fh52150fkrp33z0kklh0000gp/T//RtmpZiAUT6/GPL570.soft.gz
Mukund CPOP analysis
z1_pairwise = pairwise_col_diff(z1) %>% as.matrix()
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'x' in selecting a method for function 'as.matrix': vector memory exhausted (limit reached?)
getting the results vectors
---
title: "Liam Research 1"
output: html_notebook
---

```{r, include=FALSE}
library(tidyverse)
library(tuneR)
library(devtools)
library(ggplot2)
library(tsfeatures)
library(class)
library(cvTools)
library(randomForest)
library(GEOquery) 
library(R.utils)
library(reshape2)
library(limma)
library(dplyr)
library(e1071)
library(DT)
library(viridis)
library(plotly)
library(scales)
library(CPOP)
library(matrixStats)
```

```{r}
geo_name_list = c("GSE129166", "GSE34748", "GSE51675", "GSE15296", "GSE46474", "GSE50084")
geo_list = c(getGEO(filename="Data/GSE129166_series_matrix.txt.gz"), getGEO(filename="Data/GSE34748_series_matrix.txt.gz"), getGEO(filename="Data/GSE51675_series_matrix.txt.gz"), getGEO(filename="Data/GSE15296_series_matrix.txt.gz"), getGEO(filename="Data/GSE46474_series_matrix.txt.gz"), getGEO(filename="Data/GSE50084_series_matrix.txt.gz"))
```

```{r}
for(geo in geo_list) {
  print(fData(geo))
  print(names(which(colSums(is.na(fData(geo)))>0)))
}
```
All gse have a gene symbol column which can be used to match records except for the last one, GSE50084 which has the gene as the 2nd listed element under gene_assignment, and GSE51675 which is missing all of its gene info. 


```{r}
for(geo in geo_list) {
  print(pData(geo))
  print(names(which(colSums(is.na(pData(geo)))>0)))
}
```

```{r}
for (geo in geo_list) {
  print(data.frame(t(exprs(geo))))
  print(names(which(colSums(is.na(data.frame(t(exprs(geo)))))>0)))
}
```

```{r}
for (geo in geo_list) {
  df = data.frame(t(exprs(geo)))
  boxplot(df[1:100])
}
```
Plots for GSE51675 and GSE15296 show that they have been transformed somehow. Other than those, the other 4 seem fairly contained between 0-14 expression intensity.

GSE51675 has been discarded due to small sample size, missing info, and transformed expression values. GSE50084 has been discarded



```{r}
# Contains "GSE129166", "GSE34748"
length(union(fData(geo_list[[1]])["Gene Symbol"], fData(geo_list[[2]])["Gene Symbol"])[[1]])
```

```{r}
# Contains "GSE129166", "GSE46474"
length(union(fData(geo_list[[1]])["Gene Symbol"], fData(geo_list[[5]])["Gene Symbol"])[[1]])
```

All 3 have perfect overlap so the datasets recommended for use are "GSE129166", "GSE34748", and "GSE46474"

(Mukund): Also just thinking, but we probably do want to go through the effort of trying to clean another dataset to match these. firstly for the sake of increased difficulty in the eyes of Jean, but also sometimes without going through the process it can be easy to overlook something in the data cleaning aspect. I am just a bit worried because after talking to some other people they have all told me the data cleaning for this is really hard so not sure if we are actually missing something. 

```{r}
GSE34748 = getGEO(filename="Data/GSE34748_series_matrix.txt.gz")
clinical_GSE34748 = pData(GSE34748)

```
Also just doing some checking and some of these don't actually have clinical data

```{r}
GSE46474 = getGEO(filename="Data/GSE46474_series_matrix.txt.gz")
clinical_GSE46474 = pData(GSE46474)

```
This one has clinical data!!
```{r}
GSE129166 = getGEO(filename="Data/GSE129166_series_matrix.txt.gz")
clinical_GSE129166 = pData(GSE129166)

```
Also no clinical data
```{r}
#GSE129166 = getGEO(filename="Data/GSE129166_series_matrix.txt.gz")
#clinical_GSE129166 = pData(GSE129166)

```

## Mukund CPOP analysis 
```{r}
#CPOP data

## keeping only the 100 most variable genes in my data frame 
exp_GSE34748 = (exprs(GSE34748))
Variance = rowVars(as.matrix(exp_GSE34748))
Variance = as.data.frame(Variance)
exp_GSE34748 = as.data.frame(exp_GSE34748)
exp_GSE34748 = cbind(exp_GSE34748, variance = Variance)
exp_GSE34748 = slice_max(exp_GSE34748, order_by = Variance, n = 2000)
exp_GSE34748 = subset(exp_GSE34748, select = -c(Variance))
row_names_exp_GSE34748 = rownames(exp_GSE34748)


exp_GSE46474 = (exprs(GSE46474))
Variance = rowVars(as.matrix(exp_GSE46474))
Variance = as.data.frame(Variance)
exp_GSE46474 = as.data.frame(exp_GSE46474)
exp_GSE46474 = cbind(exp_GSE46474, variance = Variance)
exp_GSE46474 = slice_max(exp_GSE46474, order_by = Variance, n = 2000)
exp_GSE46474 = subset(exp_GSE46474, select = -c(Variance))
row_names_exp_GSE46474 = rownames(exp_GSE46474)


exp_GSE129166 = (exprs(GSE129166))
Variance = rowVars(as.matrix(exp_GSE129166))
Variance = as.data.frame(Variance)
exp_GSE129166 = as.data.frame(exp_GSE129166)
exp_GSE129166 = cbind(exp_GSE129166, variance = Variance)
exp_GSE129166 = slice_max(exp_GSE129166, order_by = Variance, n = 2000)
exp_GSE129166 = subset(exp_GSE129166, select = -c(Variance))
row_names_exp_GSE129166 = rownames(exp_GSE129166)


intersection = intersect(row_names_exp_GSE34748, row_names_exp_GSE46474)
intersection = intersect(intersection, row_names_exp_GSE129166)

exp_GSE34748 = as.data.frame(t(as.matrix(exp_GSE34748)))
exp_GSE34748 = subset(exp_GSE34748, select = c(intersection))

exp_GSE46474 = as.data.frame(t(as.matrix(exp_GSE46474)))
exp_GSE46474 = subset(exp_GSE46474, select = c(intersection))

exp_GSE129166 = as.data.frame(t(as.matrix(exp_GSE129166)))
exp_GSE129166 = subset(exp_GSE129166, select = c(intersection))

GSE34748_id <- data.frame("Dataset" = rep("GSE34748",nrow(exp_GSE34748)))
GSE46474_id <- data.frame("Dataset" = rep("GSE46474",nrow(exp_GSE46474)))
GSE129166_id <- data.frame("Dataset" = rep("GSE129166",nrow(exp_GSE129166)))

z1 = exp_GSE34748 %>% as.matrix()
z2 = exp_GSE46474 %>% as.matrix()
z3 = exp_GSE129166 %>% as.matrix()

## arcsine transformation

exp_GSE34748_arc <- exp_GSE34748
exp_GSE34748_arc = exp_GSE34748_arc / max(exp_GSE34748_arc)
exp_GSE34748_arc = asin(sqrt(exp_GSE34748_arc))

exp_GSE46474_arc <- exp_GSE46474
exp_GSE46474_arc = exp_GSE46474_arc / max(exp_GSE46474_arc)
exp_GSE46474_arc = asin(sqrt(exp_GSE46474_arc))

exp_GSE129166_arc <- exp_GSE129166
exp_GSE129166_arc = exp_GSE129166_arc / max(exp_GSE129166_arc)
exp_GSE129166_arc = asin(sqrt(exp_GSE129166_arc))

z1_pairwise = pairwise_col_diff(z1) %>% as.matrix()
z2_pairwise = pairwise_col_diff(z2) %>% as.matrix()
z3_pairwise = pairwise_col_diff(z3) %>% as.matrix()


z1_arc = pairwise_col_diff(exp_GSE34748_arc) %>% as.matrix()
z2_arc = pairwise_col_diff(exp_GSE46474_arc) %>% as.matrix()
z3_arc = pairwise_col_diff(exp_GSE129166_arc) %>% as.matrix()

## log transform

exp_GSE34748_log <- exp_GSE34748
exp_GSE34748_log = exp_GSE34748_log + 1
exp_GSE34748_log = log(exp_GSE34748_log)

exp_GSE46474_log <- exp_GSE46474
exp_GSE46474_log = exp_GSE46474_log + 1
exp_GSE46474_log = log(exp_GSE46474_log)

exp_GSE129166_log <- exp_GSE129166
exp_GSE129166_log = exp_GSE129166_log + 1
exp_GSE129166_log = log(exp_GSE129166_log)

z1_log = pairwise_col_diff(exp_GSE34748_log) %>% as.matrix()
z2_log = pairwise_col_diff(exp_GSE46474_log) %>% as.matrix()
z3_log = pairwise_col_diff(exp_GSE129166_log) %>% as.matrix()


```

## pre transformation plot
```{r}
box11 = cbind(boxplot_tbl(z1, index = 1), GSE34748_id)
box22 = cbind(boxplot_tbl(z2, index = 1), GSE46474_id)
box33 = cbind(boxplot_tbl(z3, index = 1), GSE129166_id)
box4 = rbind(box11, box22, box33)

expressionplot <-
ggplot(data = box4, aes(x = object, y = means)) +
  geom_point(aes(color = Dataset), size = 0.1) +
  geom_errorbar(aes(ymin = q1,
                    ymax = q3,
                    color = Dataset), size = 0.1,  alpha = 0.2) +
  ggsci::scale_color_d3() +
  theme(axis.ticks = element_blank()) +
  theme(axis.text.x = element_blank()) +
  theme(axis.title.x=element_blank()) +
  theme(axis.title.y=element_blank()) +
  ylim(0,15) + 
  theme(legend.position="bottom") +
  theme(legend.title = element_blank()) +
  labs(title = "Raw Data") +
  theme(plot.title = element_text(size=10))

expressionplot
```

## Boxplot to visualise if the arc transformations were good
```{r}
box1_arc = cbind(boxplot_tbl(z1_arc, index = 1), GSE34748_id)
box2_arc = cbind(boxplot_tbl(z2_arc, index = 1), GSE46474_id)
box3_arc = cbind(boxplot_tbl(z3_arc, index = 1), GSE129166_id)
box4_arc = rbind(box1_arc, box2_arc, box3_arc)

arcplot <-
ggplot(data = box4_arc, aes(x = object, y = means)) +
  geom_point(aes(color = Dataset), size = 0.1) +
  geom_errorbar(aes(ymin = q1,
                    ymax = q3,
                    color = Dataset), size = 0.1,  alpha = 0.2) +
  ggsci::scale_color_d3() +
  theme(axis.ticks = element_blank()) +
  theme(axis.text.x = element_blank()) +
  xlab("Samples") +
  theme(axis.title.y=element_blank()) +
  labs(title = "Arcsine transformation + pairwise difference") +
  theme(plot.title = element_text(size=10))

arcplot
```

## boxplot to see if the log transformation was good
```{r}
box1_log = cbind(boxplot_tbl(z1_log, index = 1), GSE34748_id)
box2_log = cbind(boxplot_tbl(z2_log, index = 1), GSE46474_id)
box3_log = cbind(boxplot_tbl(z3_log, index = 1), GSE129166_id)
box4_log = rbind(box1_log, box2_log, box3_log)

logplot <-
ggplot(data = box4_log, aes(x = object, y = means)) +
  geom_point(aes(color = Dataset), size = 0.1) +
  geom_errorbar(aes(ymin = q1,
                    ymax = q3,
                    color = Dataset), size = 0.1,  alpha = 0.2) +
  ggsci::scale_color_d3() +
  theme(axis.ticks = element_blank()) +
  theme(axis.text.x = element_blank()) +
  xlab("Samples") +
  theme(axis.title.y=element_blank()) +
  labs(title = "Log transformation + pairwise difference") +
  theme(plot.title = element_text(size=10))

logplot
```
## getting the results vectors 
```{r}
pData(GSE46474)
pData(GSE129166) #is there rejection and stable
pData(GSE34748) #no reject or stable



### GSE36059
### GSE48581
# these have reject + stable but categorized in more detail -> either have more groups that we are predicting, or we could do purely binary 
```

